library(Biostrings)
library(gplots)
## Warning: package 'gplots' was built under R version 3.2.4
library(RColorBrewer)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
library(reshape2)

Get fasta data

Tn019.fasta <- readDNAStringSet('Tn019_S2_L001_redundans/cdhit1000.fa')

Dinucleotide frequency clustering

tet <- oligonucleotideFrequency(Tn019.fasta, 2, as.prob = T)

rownames(tet) <- names(Tn019.fasta)

cl <- hclust(dist(tet), method='ward.D2')
plot(as.dendrogram(cl), leaflab='none')

#hmcol <- rev(colorRampPalette(brewer.pal(9, "YlGnBu"))(255))
#lab.col <- colorRampPalette(brewer.pal(9, 'RdPu'))(255)
#scaled.sizes <- trunc(width(Tn019.fasta)/max(width(Tn019.fasta))*255)

#heatmap.2(tet, dendrogram = 'row', Rowv = as.dendrogram(cl), trace = 'none', labRow = '', scale='row', col=hmcol, lhei = c(0.5, 4), RowSideColors = lab.col[scaled.sizes])

PCA plot - how well does clustering match up with this?

cl.split <- cutree(cl, 7)
table(cl.split)
## cl.split
##    1    2    3    4    5    6    7 
## 1249 1155  882  841  720  752   72
tet.pca <- prcomp(tet, center = T, scale. = T)
plot(tet.pca, type='l')

scores <- data.frame(rownames(tet), tet.pca$x[,1:3], length=width(Tn019.fasta), cluster=cl.split)
qplot(x=PC1, y=PC2, data=scores, col=as.factor(cluster))

qplot(x=PC1, y=PC3, data=scores, col=as.factor(cluster))

qplot(x=PC2, y=PC3, data=scores, col=as.factor(cluster))

Map dinucleotide frequency data onto contig coverage and SNP rate

fullSummary <- read.delim('Varscan.CovSNPrate.summary.tab', header=T)
clusterData <- data.frame(Chrom = gsub('\\|size[0-9]+', '', names(Tn019.fasta)),
                          cluster = cl.split)

allData <- merge(subset(fullSummary, Sample=='Tn019'), clusterData, by = 'Chrom', all.x=T)
allData[is.na(allData$cluster),'cluster'] <- 0

#Coverage distributions
ggplot(allData, aes(x=medCov)) + geom_histogram(binwidth=5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 150)) + geom_freqpoly(aes(x=medCov, col=as.factor(cluster)), binwidth=5, size=2)
## Warning: Removed 173 rows containing non-finite values (stat_bin).

## Warning: Removed 173 rows containing non-finite values (stat_bin).
## Warning: Removed 14 rows containing missing values (geom_path).

#SNP rate distrubutions
ggplot(allData, aes(x=SNPrate*100)) + geom_histogram(binwidth=0.5, alpha=0.2, col='black') + geom_freqpoly(aes(x=SNPrate*100, col=as.factor(cluster)), binwidth=0.5, size=2) + scale_x_continuous(limits = c(0,10))
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 14 rows containing missing values (geom_path).

# Load SNP-only dataset
snpSummary <- read.delim('Tn019_S2_L001_v_cdhit.VarScanSNPs.tab', header=T)

snpSummary <- cbind(colsplit(snpSummary$Chrom, '\\|size', c("Chrom", "ContigLength")), snpSummary[2:19])

snpSummary$VarFreq <- as.numeric(gsub('%','',snpSummary$VarFreq))

snpSummary['MinorAlleleFreq'] <- ifelse(snpSummary$VarFreq <= 50, snpSummary$VarFreq, 100-snpSummary$VarFreq)

# Merge with cluster data (keep only SNPs that are observed on both strands)
allSNPs <- merge(subset(snpSummary, Strands1 == 2 & Strands2 == 2), clusterData, by = 'Chrom', all.x=T)

ggplot(allSNPs, aes(x=MinorAlleleFreq)) + geom_histogram(binwidth=1, alpha=0.2, col='black') + geom_freqpoly(aes(x=MinorAlleleFreq, col=as.factor(cluster)), binwidth=1, size=2)

Repeat using tetranucleotide frequency

tet <- oligonucleotideFrequency(Tn019.fasta, 4, as.prob = T)
rownames(tet) <- names(Tn019.fasta)

cl <- hclust(dist(tet), method='ward.D2')
plot(as.dendrogram(cl), leaflab='none')

cl.split <- cutree(cl, 5)
table(cl.split)
## cl.split
##    1    2    3    4    5 
## 1385 1225 1279 1102  680
tet.pca <- prcomp(tet, center = T, scale. = T)
plot(tet.pca, type='l')

scores <- data.frame(rownames(tet), tet.pca$x[,1:3], length=width(Tn019.fasta), cluster=cl.split)
qplot(x=PC1, y=PC2, data=scores, col=as.factor(cluster))

qplot(x=PC1, y=PC3, data=scores, col=as.factor(cluster))

qplot(x=PC2, y=PC3, data=scores, col=as.factor(cluster))

clusterData <- data.frame(Chrom = gsub('\\|size[0-9]+', '', names(Tn019.fasta)),
                          cluster = cl.split)

allData <- merge(subset(fullSummary, Sample=='Tn019'), clusterData, by = 'Chrom', all.x=T)
allData[is.na(allData$cluster),'cluster'] <- 0

#Coverage distributions
ggplot(allData, aes(x=medCov)) + geom_histogram(binwidth=5, alpha=0.2, col='black') + scale_x_continuous(limits = c(0, 150)) + geom_freqpoly(aes(x=medCov, col=as.factor(cluster)), binwidth=5, size=2)
## Warning: Removed 173 rows containing non-finite values (stat_bin).

## Warning: Removed 173 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing missing values (geom_path).

#SNP rate distrubutions
ggplot(allData, aes(x=SNPrate*100)) + geom_histogram(binwidth=0.5, alpha=0.2, col='black') + geom_freqpoly(aes(x=SNPrate*100, col=as.factor(cluster)), binwidth=0.5, size=2) + scale_x_continuous(limits = c(0,10))
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 53 rows containing non-finite values (stat_bin).
## Warning: Removed 10 rows containing missing values (geom_path).

# Merge with cluster data (keep only SNPs that are observed on both strands)
allSNPs <- merge(subset(snpSummary, Strands1 == 2 & Strands2 == 2), clusterData, by = 'Chrom', all.x=T)

ggplot(allSNPs, aes(x=MinorAlleleFreq)) + geom_histogram(binwidth=1, alpha=0.2, col='black') + geom_freqpoly(aes(x=MinorAlleleFreq, col=as.factor(cluster)), binwidth=1, size=2)